Set the working dir
setwd("/mnt/picea/projects/spruce/sjansson/seasonal-needles/u2015030")
Libraries
suppressPackageStartupMessages(library(amap))
suppressPackageStartupMessages(library(corrplot))
suppressPackageStartupMessages(require(graphics))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(matrixStats))
Load table of counts vst transformed
load(file = "vst_aware.rda")
Focus on one year of sampling - samples past 2012-04-24 are the next generation of needles
sel <- ordered_dates %in% levels(ordered_dates)[1:(grep("2012-05",levels(ordered_dates))[1]-1)]
ordered_dates <- factor(as.character(ordered_dates)[sel])
vst_aware <- vst_aware[,sel]
Change the dates nomenclature
ddf <- do.call(rbind,strsplit(levels(ordered_dates),"-"))
labels <- ordered_dates
levels(labels) <- ifelse(as.integer(ddf[,3]) >= 25,
month.name[as.integer(ddf[,2])+1],
ifelse(as.integer(ddf[,3]) <= 10,
month.name[as.integer(ddf[,2])],
paste0("mid-",month.name[as.integer(ddf[,2])])
))
plotLabels <- as.character(labels[match(levels(ordered_dates),ordered_dates)])
plotLabels[duplicated(plotLabels)] <- ""
Combine the replicates
vst_bio_rep_mean <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colMeans))
vst_bio_rep_median <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colMedians))
rownames(vst_bio_rep_median) <- rownames(vst_bio_rep_mean)
vst_bio_rep_sd <- do.call(cbind,lapply(split.data.frame(t(vst_aware),ordered_dates),colSds))
vst_bio_rep_lwr <- vst_bio_rep_mean-qt(0.975,3)*vst_bio_rep_sd/sqrt(3)
“2011-06-30” has no replicates, thus it is impossible to calculate sd, lwr and upr replacement of NA values in upr and lwr by the mean to be able to plot the confidence interval of the mean later
vst_bio_rep_lwr[,"2011-06-30"] <- vst_bio_rep_mean[,"2011-06-30"]
vst_bio_rep_lwr[vst_bio_rep_lwr < 0] <- 0
vst_bio_rep_upr <- vst_bio_rep_mean+qt(0.975,3)*vst_bio_rep_sd/sqrt(3)
vst_bio_rep_upr[,"2011-06-30"] <- vst_bio_rep_mean[,"2011-06-30"]
Scale the data
s.vst <- t(scale(t(vst_aware)))
s.vst.mean <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colMeans))
s.vst.median <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colMedians))
rownames(s.vst.median) <- rownames(s.vst.mean)
s.vst.sd <- do.call(cbind,lapply(split.data.frame(t(s.vst),ordered_dates),colSds))
s.vst.lwr <- s.vst.mean-qt(0.975,3)*s.vst.sd/sqrt(3)
“2011-06-30” has no replicates, thus it is impossible to calculate sd, lwr and upr replacement of NA values in upr and lwr by the mean to be able to plot the confidence interval of the mean later
s.vst.lwr[,"2011-06-30"] <- s.vst.mean[,"2011-06-30"]
#s.vst.lwr[s.vst.lwr < 0] <- 0
s.vst.upr <- s.vst.mean+qt(0.975,3)*s.vst.sd/sqrt(3)
s.vst.upr[,"2011-06-30"] <- s.vst.upr[,"2011-06-30"]
read Gene Of Interest csv file
goi <- read.csv("~/Git/UPSCb/projects/spruce-needles/doc/GOI_list.csv", header = FALSE)
read GOI names file
goi_names <- read.csv("~/Git/UPSCb/projects/spruce-needles/doc/GOI_names.txt", header=FALSE)
Do a loop to plot the expression profile for each gene of the list
for (i in 1:nrow(goi)) {
message(i)
#check if goi are in vst with %in%
if (goi[i,1] %in% rownames(vst_aware)) {
g_median <- vst_bio_rep_median[rownames(vst_bio_rep_median) == goi[i,1]]
g_mean <- vst_bio_rep_mean[rownames(vst_bio_rep_mean) == goi[i,1]]
g_lwr <- vst_bio_rep_lwr[rownames(vst_bio_rep_lwr) == goi[i,1]]
g_upr <- vst_bio_rep_upr[rownames(vst_bio_rep_upr) == goi[i,1]]
DF <- data.frame(time=as.Date(levels(ordered_dates)),
mean=g_median,
lwr=g_lwr,
upr=g_upr)
p <- ggplot(DF, aes(time, group = 1)) +
annotate("rect", xmin=as.Date("2011-10-01"),
xmax=as.Date("2012-03-01"),
ymin=-Inf,ymax=+Inf,
fill="#ADD8E6",alpha=0.3) +
geom_line(aes(y=mean), color="blue") +
geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=0.3) +
scale_x_date(date_breaks = "month") +
theme_bw() + coord_cartesian(ylim=c(0,max(vst_bio_rep_upr))) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(y = "vst counts", title = paste("Expression profile of", goi_names[i,1]))
plot(p)
suppressMessages(ggsave(filename = paste0("expression_profiles/",goi_names[i,1],"_expression_profile.jpeg")))
DF <- data.frame(time=as.Date(levels(ordered_dates)),
mean=s.vst.mean[rownames(s.vst.mean) == goi[i,1]],
lwr=s.vst.lwr[rownames(s.vst.lwr) == goi[i,1]],
upr=s.vst.upr[rownames(s.vst.upr) == goi[i,1]])
p <- ggplot(DF, aes(time, group = 1)) +
annotate("rect", xmin=as.Date("2011-10-01"),
xmax=as.Date("2012-03-01"),
ymin=-Inf,ymax=+Inf,
fill="#ADD8E6",alpha=0.3) +
geom_line(aes(y=mean), color="blue") +
geom_ribbon(aes(ymin=lwr, ymax=upr), alpha=0.3) +
scale_x_date(date_breaks = "month") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(y = "standard score") +
ggtitle(paste("Scaled expression profile of", goi_names[i,1]),
subtitle = paste("Average vst expression:",
round(mean(g_mean),digits=2)))
plot(p)
suppressMessages(ggsave(filename = paste0("expression_profiles/",goi_names[i,1],"_scaled_expression_profile.jpeg")))
}
}
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
## 51
## 52
## 53
## 54
## 55
## 56
## 57
## 58
## 59
## 60
## 61
## 62
## 63
## 64
## 65
## 66
## 67
## 68
## 69
## 70
## 71
## 72
## 73
## 74
## 75
## 76
## 77
## 78
## 79
## 80
## 81
## 82
## 83
## 84
## 85
## 86
Remark ELIP_C does not match our data genes id ### Hierarchical clustering to see which genes have similar patterns Improve gene of interest data (add ELIP_C)
goi <- cbind(goi,goi_names)
colnames(goi) <- c("id","name")
select mean vst counts (of biological replicates for one time point) for gene of interest (goi)
vst_bio_rep_mean_goi <- vst_bio_rep_mean[rownames(vst_bio_rep_mean) %in% goi$id,]
replace gene ids by gene names as rownames
goi <- goi[! goi$name == "ELIP_C",]
goi <- goi[order(match(goi$id,rownames(vst_bio_rep_mean_goi))),]
rownames(vst_bio_rep_mean_goi) <- goi$name
Normalize data to obtain z-score to quantify only the variation around the mean (=pattern) and not the amplitude anymore for each each gene by doing so we can compare the genes relatively to their expression pattern and not relatively of their amplitude like before
vst_bio_rep_mean_goi_scaled <- t(scale(t(vst_bio_rep_mean_goi)))
Remove RabA1, which is not expressed
sel <- which(rowSums(is.na((vst_bio_rep_mean_goi_scaled))) > 0)
vst_bio_rep_mean_goi_scaled <- vst_bio_rep_mean_goi_scaled[-sel,]
vst_bio_rep_mean_goi <- vst_bio_rep_mean_goi[-sel,]
qqplot to check if our genes expression follows a Normal distribution or not
qqnorm(vst_bio_rep_mean_goi)
qqline(vst_bio_rep_mean_goi, col=3)
qqnorm(vst_bio_rep_mean_goi_scaled)
qqline(vst_bio_rep_mean_goi_scaled, col = 2)
==> does not follow a line, the distribution is not Normal ==> then Spearman correlation should be use because is not parametric Perform hierarchical clustering Firstly compute distance according to “spearman correlation” method Secondly compute clustering according to “complete” linkage
vst_bio_rep_mean_goi_cluster <- hcluster(vst_bio_rep_mean_goi, method = "spearman", link = "complete")
vst_bio_rep_mean_goi_scaled_cluster <- hcluster(vst_bio_rep_mean_goi_scaled, method = "spearman", link = "complete")
Plot the dendrogram
plot(vst_bio_rep_mean_goi_cluster,
main="Cluster Dendrogram of photosynthetic genes",
cex=0.8)
plot(vst_bio_rep_mean_goi_scaled_cluster,
main="Cluster Dendrogram of photosynthetic genes",
cex=0.8)
pdf(file="photosyntetic-genes-dendrogram.pdf",width=12,height=8)
plot(vst_bio_rep_mean_goi_scaled_cluster,
main="Cluster Dendrogram of photosynthetic genes",
cex=0.8,xlab="photosyntetic genes",sub="spearman distance, complete linkage")
dev.off()
## png
## 2
Plot the correlation
pdf(file = "expression_profiles/photosynthetic_genes_correlation_matrix.pdf")
corrplot(cor(t(vst_bio_rep_mean_goi)),method="square",order="hclust",
title = "Correlation matrix of photosynthetic genes",
tl.cex=0.4, cl.cex=0.5, tl.col="black", addrect=2, is.corr = FALSE, mar = c(0,0,2,0))
dev.off()
## png
## 2
corrplot(cor(t(vst_bio_rep_mean_goi_scaled)),method="square",order="hclust",
tl.cex=0.4, cl.cex=0.5, tl.col="black", addrect=2, is.corr = FALSE)
## R version 3.4.3 (2017-11-30)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] matrixStats_0.53.1 ggplot2_2.2.1 corrplot_0.84
## [4] amap_0.8-14
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.16 knitr_1.20 magrittr_1.5 munsell_0.4.3
## [5] colorspace_1.3-2 rlang_0.2.0 stringr_1.3.0 plyr_1.8.4
## [9] tools_3.4.3 grid_3.4.3 gtable_0.2.0 htmltools_0.3.6
## [13] yaml_2.1.18 lazyeval_0.2.1 rprojroot_1.3-2 digest_0.6.15
## [17] tibble_1.4.2 evaluate_0.10.1 rmarkdown_1.9 labeling_0.3
## [21] stringi_1.1.7 compiler_3.4.3 pillar_1.2.1 scales_0.5.0
## [25] backports_1.1.2